Created in June 2019 - Updated in August 2019
Goal: we want to obtain info about all patents citing publications from a given university, in this case, Kent State University (KSU).
import time
#
import pandas as pd
from pandas.io.json import json_normalize
import plotly_express as px
#
from plotly.offline import init_notebook_mode # needed for exports
init_notebook_mode(connected=True)
#
import dimcli
from dimcli.shortcuts import dslquery, dslqueryall, chunks_of, normalize_key
# Authenticate against the Dimensions API (no args: credentials presumably
# picked up from dimcli's config file or environment -- TODO confirm).
dimcli.login()
GRIDID = "grid.258518.3"  # Kent State University (KSU)
YEAR_START = "2009"

# Group publications by year (will be useful later to create a histogram).
# BUG FIX: the original query had no space between the GRID id and `return`
# (`..."return year`), which breaks the DSL query syntax.
df_all_pubs_per_year = dslquery(
    f"""search publications where research_orgs.id="{GRIDID}" return year limit 1000"""
).as_dataframe()

# Get full list of relevant publications linked to this organization
pubs_details = dslqueryall(
    f"""search publications where research_orgs.id="{GRIDID}" and year>="{YEAR_START}" return publications[basics+FOR]"""
)
df_pubs_details = pubs_details.as_dataframe()
# Create a new list of publications with simplified FOR data.
# Ensure that all pubs have a FOR value (empty, even), and strip the first
# 5 characters of each FOR name (presumably a "NNNN " numeric code prefix --
# TODO confirm against the data) to improve legibility.
for pub in pubs_details.publications:
    if 'FOR' not in pub:
        # falsy placeholder: json_normalize below yields no records for it
        pub['FOR'] = ""
    else:
        # NOTE: the original comprehension reused `x` as both the outer loop
        # variable and the comprehension variable; use distinct names.
        pub['FOR'] = [{'name': category['name'][5:]} for category in pub['FOR']]

# One row per (publication, FOR category) pair; publication-level fields are
# carried along as meta columns.
df_pubs_for = json_normalize(
    pubs_details.publications,
    record_path=['FOR'],
    meta=["id", "type", ["journal", "title"], "year"],
    errors='ignore',
    record_prefix='for_',
)
# Extract all patents linked to the publications dataset previously created.
# Note: the patents query goes like this:
#   `search patents where publication_ids in ["pub.1111511314", ...] return patents`
BATCHSIZE = 400

def run(ids_list):
    """Fetch all patents citing any publication id in `ids_list`.

    Queries the Dimensions API in batches of BATCHSIZE ids and returns a
    flat list of patent records. Each query is capped at `limit 1000`, so a
    single batch matching more than 1000 patents would be truncated --
    TODO confirm this is acceptable for this dataset.
    """
    patents_out = []
    # BUG FIX: the original comment said "chunks of 200" while BATCHSIZE is 400.
    for n, chunk in enumerate(chunks_of(ids_list, BATCHSIZE), start=1):
        quoted_ids = ','.join(f'"{i}"' for i in chunk)
        data = dslquery(
            f"""search patents where publication_ids in [{quoted_ids}] return patents[basics+publication_ids+FOR] limit 1000"""
        )
        patents_out += data.patents
        # NOTE: n * BATCHSIZE overstates the processed count on the last,
        # possibly partial, batch; kept for log continuity.
        print("[log] ", n * BATCHSIZE, " pubs > patents: ", len(data.patents))
        time.sleep(1)  # be gentle with API rate limits
    return patents_out

patents_list = run(list(df_pubs_details['id']))
After going through all publications and extracting related patents, let's save the patents data so that we can use it later:
# Build a dataframe of patent details.
# IDIOM FIX: `from_dict` is a classmethod -- call it on the class instead of
# constructing a throwaway `pd.DataFrame()` instance first.
df_patent_details = pd.DataFrame.from_dict(patents_list)
# save to CSV
df_patent_details.to_csv("data/KSU_patents_by_id.csv")
# display top 3 rows
df_patent_details.head(3)
# Normalise the assignees data in order to analyse it further later on.
# `normalize_key` is a dimcli helper; presumably it ensures every patent
# record has an 'assignees' key -- TODO confirm against dimcli docs.
normalize_key('assignees', patents_list)
# build a dataframe: one row per (patent, assignee) pair; patent-level fields
# (id, year, title) are carried over with a "grant_" prefix
df_patents_assignees = json_normalize(patents_list, record_path=['assignees'], meta=['id', 'year', 'title'], meta_prefix="grant_")
# save to CSV
df_patents_assignees.to_csv("data/KSU_patents_by_assignees.csv")
df_patents_assignees.head()
# Normalize FOR codes on the patent records (same treatment as publications:
# guarantee a FOR value and strip the leading 5-char numeric prefix).
for patent in patents_list:
    if 'FOR' not in patent:
        # falsy placeholder: json_normalize below yields no records for it
        patent['FOR'] = ""
    else:
        # NOTE: distinct comprehension variable -- the original shadowed the
        # outer loop variable `x` inside the comprehension.
        patent['FOR'] = [{'name': category['name'][5:]} for category in patent['FOR']]

# One row per (patent, FOR category) pair, with patent id/year/title as meta.
df_patents_for = json_normalize(patents_list, record_path=['FOR'], meta=["id", "year", "title"], errors='ignore', record_prefix='for_')
# save to CSV
df_patents_for.to_csv("data/KSU_patents_by_FOR.csv")
df_patents_for.head()
Finally, let's create another publications index, including only the KSU publications cited by the patents extracted above.
# Collect the publication ids cited by the patents, de-duplicated via a set
# comprehension (equivalent to the append-loop + set() round-trip).
pubs_referenced_from_patents = list({
    pub_id
    for patent in patents_list
    if patent['publication_ids']
    for pub_id in patent['publication_ids']
})
len(pubs_referenced_from_patents)
Intersect the list of KSU publications with the list of publications mentioned in patents.
# Keep only the KSU publications that are cited by at least one patent.
df_linked_pubs = df_pubs_details[df_pubs_details['id'].isin(pubs_referenced_from_patents)]
# BUG FIX: reset_index returns a new dataframe (it is not in-place by
# default); the original discarded the result, leaving the old index.
df_linked_pubs = df_linked_pubs.reset_index(drop=True)
# save to CSV
df_linked_pubs.to_csv("data/KSU_pubs_linked_to_patents.csv")
df_linked_pubs.head()
Also, create a version of df_linked_pubs with simplified FOR codes so that it's easier to visualise.
# FOR-level view restricted to the patent-cited publications.
df_linked_pubs_for = df_pubs_for[df_pubs_for['id'].isin(pubs_referenced_from_patents)]
# BUG FIX: assign the result -- reset_index is not in-place by default and
# the original discarded its return value.
df_linked_pubs_for = df_linked_pubs_for.reset_index(drop=True)
df_linked_pubs_for.head()
# Histogram of all KSU publications per year (the year facet query returns
# id/count columns).
px.bar(df_all_pubs_per_year, x="id", y="count", title="Total Publications per year from KSU")
# Research-area scatter for all publications, with marginal totals.
px.scatter(df_pubs_for, x="year", y="for_name", color="type", hover_name="for_name", marginal_x="histogram", marginal_y="histogram", height=800, title="Research areas of pubs in last 10 years (marginal subplots = X/Y totals)")
# Publications cited in patents, grouped by publication year.
px.bar(df_linked_pubs.groupby('year', as_index=False).count(), x="year", y="id", title="Publications mentioned in patents, by year of publication")
# Research areas of the patent-cited publications.
px.scatter(df_linked_pubs_for, x="year", y="for_name", color="type", hover_name="for_name", marginal_x="histogram", marginal_y="histogram", height=800, title="Research areas of pubs mentioned in patents (marginal subplots = X/Y totals)")
# Citing patents per year.
px.bar(df_patent_details.groupby('year', as_index=False).count(), x="year", y="id", title="Patents per Year citing publications from KSU")
# Research areas of the citing patents.
px.scatter(df_patents_for, x="year", y="for_name", hover_name="for_name", marginal_x="histogram", marginal_y="histogram", height=800, title="Research areas of patents (marginal subplots = X/Y totals)")
# Patent citations vs. filing status, faceted.
px.scatter(df_patent_details, x="year", y="times_cited", hover_name="title", hover_data=['id'], facet_col="filing_status", title="Patents per Year VS Timed Cited VS Filing Status")
# Assignees ranked by number of patents.
px.bar(df_patents_assignees.groupby('name', as_index=False).count().sort_values(by="grant_id", ascending=False), x="name", y="grant_id", hover_name="name", height=400, title="Assignees by No of Patents")
# Assignees by country and grant year.
px.scatter(df_patents_assignees, x="grant_year", y="name", color="country_name", hover_name="name", hover_data=["id"], height=800, title="Assignees By Country and Year")
Check out the Dimensions API Lab website, which contains many tutorials and reusable Jupyter notebooks for scholarly data analytics.